Setup
library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(cowplot)
all_counts <- readRDS(file= "Data/pc_count_matrix.rds")
avg_counts <- readRDS(file="Data/avg_pc_count_matrix.rds")
diff_counts <- readRDS(file="Data/diff_pc_count_matrix.rds")
meta <- suppressMessages(readr::read_tsv("Data/complete_meta_data.tsv"))
meta <- meta %>% mutate(tissue_type=replace(tissue_type, tissue_type=="embryonic facial prominence", "facial prominence"))
meta <- meta %>% mutate(tissue_type=replace(tissue_type, tissue_type=="skeletal muscle tissue", "muscle tissue"))
meta <- meta %>% mutate(tissue_type=replace(tissue_type, tissue_type=="skeletal muscle tissue", "muscle tissue"))
tissue_types <- unique(meta %>% pull(tissue_type))
collapsed_dev_counts <- matrix(data=NA, nrow=nrow(all_counts),ncol=length(tissue_types))
walker <- 1
for (tissue in tissue_types){
tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
tissue_subset_count <- all_counts[,tissue_subset_ids]
tissue_avg <- rowMeans(tissue_subset_count)
collapsed_dev_counts[,walker] <- tissue_avg
walker <- walker + 1
}
colnames(collapsed_dev_counts) <- tissue_types
rownames(collapsed_dev_counts) <- rownames(all_counts)
TFs<- c("Ascl1", "Hes1", "Neurod1", "Mecp2", "Mef2c", "Runx1", "Tcf4", "Pax6")
PCA coloured by dev_stage
##Remove zero variance
all_counts_drop_var <- (t(all_counts))[, which(apply(t(all_counts),2,var) !=0)]
all_counts.pca <- prcomp(all_counts_drop_var, center = TRUE,scale. = TRUE)
library(ggfortify)
library(ggrepel)
library(RColorBrewer)
autoplot(all_counts.pca, data=meta, colour="dev_stage", label.size = 3, size=5) + theme_cowplot(12) + scale_color_brewer(palette = "RdYlBu")
PCA coloured by tissue
getPalette = colorRampPalette(brewer.pal(12, "Paired"))
autoplot(all_counts.pca, data=meta, colour="tissue_type", label.size = 3, size=5) + theme_cowplot(12) + scale_color_manual(values = getPalette(length(tissue_types)))
Pearson correlation heatmap
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
all_counts_pear <- all_counts
colnames(all_counts_pear) <- sapply(colnames(all_counts), function(x){temp_row <- meta %>% filter(id==x)
return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
pear_rcorr_object <- rcorr(all_counts_pear, type = "pearson")
pear_matrix <- pear_rcorr_object$r
pear_matrix_pvalues <- pear_rcorr_object$P
library(corrplot)
## corrplot 0.90 loaded
corrplot(pear_matrix, method = 'shade', type = 'lower', diag = FALSE, col = rep(rev(brewer.pal(n=8, name="RdYlBu")),2), col.lim=c(0,1))
Pearson correlation heatmap (collapsed by samples)
library(corrplot)
library(Hmisc)
avg_counts_pear <- avg_counts
colnames(avg_counts_pear) <- sapply(colnames(avg_counts), function(x){temp_row <- meta %>% filter(id==x)
return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
pear_rcorr_object <- rcorr(avg_counts_pear, type = "pearson")
pear_matrix <- pear_rcorr_object$r
pear_matrix_pvalues <- pear_rcorr_object$P
corrplot(pear_matrix, method="shade", type="lower", diag=FALSE, col = rep(rev(brewer.pal(n=8, name="RdYlBu")),2), col.lim=c(0,1))
Pearson correlation heatmap (collapsed by samples) 17 clusters (because there are 17 tissues)
library(corrplot)
library(Hmisc)
avg_counts_pear <- avg_counts
colnames(avg_counts_pear) <- sapply(colnames(avg_counts), function(x){temp_row <- meta %>% filter(id==x)
return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
pear_rcorr_object <- rcorr(avg_counts_pear, type = "pearson")
pear_matrix <- pear_rcorr_object$r
pear_matrix_pvalues <- pear_rcorr_object$P
testRes = cor.mtest(avg_counts_pear, conf.level = 0.95)
corrplot(pear_matrix, order = 'hclust', addrect = 17, method="shade", rect.col = 'black', col = rep(rev(brewer.pal(n=8, name="RdYlBu")),2), col.lim=c(0,1))
Pearson correlation heatmap (collapsed by samples) 8 clusters (because 8 unique dev stages)
library(corrplot)
library(Hmisc)
avg_counts_pear <- avg_counts
colnames(avg_counts_pear) <- sapply(colnames(avg_counts), function(x){temp_row <- meta %>% filter(id==x)
return(paste0(temp_row %>% pull(tissue_type), " ",temp_row %>% pull(dev_stage)))})
pear_rcorr_object <- rcorr(avg_counts_pear, type = "pearson")
pear_matrix <- pear_rcorr_object$r
pear_matrix_pvalues <- pear_rcorr_object$P
testRes = cor.mtest(avg_counts_pear, conf.level = 0.95)
corrplot(pear_matrix, order = 'hclust', addrect = 8, method="shade", rect.col = 'black', col = rep(rev(brewer.pal(n=8, name="RdYlBu")),2), col.lim=c(0,1))
Prepare Limma (with individual comparison)
# Demonstration of using limma to find differentially expressed genes
library(limma)
library(tidyverse)
library(edgeR)
library(tximport)
# load data
meta <- read.delim("Data/complete_meta_data.tsv", stringsAsFactors = FALSE)
pc <- read.delim("Data/pc_sub.tsv", stringsAsFactors = FALSE)
files <- paste0("Data/Count_tables/", meta$id, ".tsv")
counts <- tximport(files, type = "rsem", txIn = FALSE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
## 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
## 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
## 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
## 149 150 151 152 153 154 155 156
count_mat <- counts$counts
# name matrix and only keep protein coding genes
rownames(count_mat) <- str_replace(rownames(count_mat), "\\.[:digit:]*$", "")
colnames(count_mat) <- meta$id
count_mat <- count_mat[rownames(count_mat) %in% pc$Gene_ID, ]
oldrownames <- data.frame(rownames(count_mat))
oldrownames[,"Gene_ID"] <- oldrownames[,1]
rownames(count_mat) <- oldrownames %>% left_join(pc,"Gene_ID") %>% pull(Symbol)
# order by tissue and dev stage
meta <- meta %>% group_by(tissue_type) %>% arrange(dev_stage, .by_group = TRUE)
count_mat <- count_mat[, meta$id]
stopifnot(identical(colnames(count_mat), meta$id))
# build design matrix
design <- model.matrix(~ 0 + meta$dev_stage + meta$tissue_type)
#design <- model.matrix(~ meta$tissue_type * meta$dev_stage)
# edgeR: remove lowly expressed, scale normalize, and the log2 CPM transform
dge <- DGEList(count_mat)
keep <- filterByExpr(dge, design)
dge <- dge[keep, ]
# gene counts of removed
removed <- rowSums(count_mat[!keep,])
summary(removed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 5.00 24.36 22.41 851.45
count_mat[which.max(removed), ]
## ENCFF465YOS ENCFF516EUX ENCFF929KWG ENCFF672DDJ ENCFF772UWT ENCFF262TXH
## 217 148 29 44 27 22
## ENCFF146HIO ENCFF129XUH ENCFF132NQU ENCFF867TKM ENCFF924CMS ENCFF370UDF
## 30 33 19 17 25 45
## ENCFF369TLJ ENCFF536XKZ ENCFF302TQO ENCFF413BXV ENCFF465SNB ENCFF976OLT
## 24 33 19 12 17 17
## ENCFF393ZAF ENCFF488HRD ENCFF567AFL ENCFF227HKF ENCFF745ZJF ENCFF816CVP
## 16 21 16 13 32 14
## ENCFF763GXJ ENCFF340XFQ ENCFF590FAC ENCFF484AOO ENCFF918QNL ENCFF895JXR
## 23 23 10 17 34 24
## ENCFF127FPD ENCFF721XZC ENCFF226IWR ENCFF540EJL ENCFF125TAY ENCFF824DCQ
## 79 58 81 72 44 38
## ENCFF242GMD ENCFF976CYB ENCFF111IGW ENCFF540BJT ENCFF440PWB ENCFF219PVC
## 81 55 60 63 85 48
## ENCFF415JBI ENCFF871IGQ ENCFF817KPY ENCFF155GNG ENCFF070GIC ENCFF097IDM
## 50 58 60 41 36 42
## ENCFF750FTK ENCFF109HTF ENCFF184ZAV ENCFF684GSP ENCFF131WIM ENCFF604LWF
## 74 53 39 32 26 31
## ENCFF206KRT ENCFF741FZD ENCFF395LAH ENCFF338ZXD ENCFF211ELX ENCFF830YBR
## 43 18 31 30 34 44
## ENCFF798MSP ENCFF861GUP ENCFF959OHE ENCFF228SAS ENCFF052THP ENCFF114YCL
## 32 46 16 28 17 35
## ENCFF443JRH ENCFF278PAQ ENCFF485CJB ENCFF795XBQ ENCFF499WRT ENCFF413OJO
## 23 36 21 21 120 128
## ENCFF700YRC ENCFF347HOK ENCFF752QKG ENCFF143OJZ ENCFF905HUL ENCFF783LVC
## 122 129 120 162 257 326
## ENCFF399HJB ENCFF597VHC ENCFF195JHC ENCFF457ZGF ENCFF982ERQ ENCFF449QSP
## 22 51 34 32 20 27
## ENCFF358WYS ENCFF634AUL ENCFF677BPV ENCFF794QMH ENCFF532ZDE ENCFF003DBZ
## 30 22 28 15 29 15
## ENCFF954EHG ENCFF523MEO ENCFF669WDQ ENCFF997HBI ENCFF615ZTQ ENCFF336VTP
## 38 28 26 20 20 22
## ENCFF432ZGG ENCFF572OPZ ENCFF740SXP ENCFF504YJB ENCFF759PUL ENCFF512KYX
## 5 8 8 7 11 4
## ENCFF875HIA ENCFF143HKK ENCFF872ABP ENCFF226ILJ ENCFF718PJC ENCFF996EMC
## 10 20 50 48 38 43
## ENCFF538RNR ENCFF365SND ENCFF990GIB ENCFF658LQM ENCFF996YFF ENCFF421TRR
## 54 40 70 63 23 37
## ENCFF359ZOA ENCFF971KZC ENCFF168CVY ENCFF390KQM ENCFF422BJI ENCFF196WAD
## 57 59 15 20 24 31
## ENCFF453GZG ENCFF870YWY ENCFF706XGJ ENCFF835FSF ENCFF918YFP ENCFF052VJO
## 16 19 24 33 42 23
## ENCFF210MWH ENCFF793WMU ENCFF375JDR ENCFF298WHK ENCFF532DDR ENCFF627COG
## 45 35 8 24 11 20
## ENCFF502BTV ENCFF049EIV ENCFF513HAL ENCFF967SJG ENCFF037GWJ ENCFF365DLM
## 20 9 10 8 25 29
## ENCFF402XKL ENCFF918XYP ENCFF576YVC ENCFF480GNL ENCFF600ZQX ENCFF871WCS
## 20 31 23 25 22 35
## ENCFF050PAT ENCFF691EQW ENCFF972NMO ENCFF355MOU ENCFF288JNN ENCFF052DOQ
## 26 22 35 40 59 23
## ENCFF517XLT ENCFF434XNZ ENCFF356QFG ENCFF319WZT ENCFF385MJV ENCFF923YGS
## 41 46 15 9 25 16
# scale normalization using TMM method (default)
dge <- calcNormFactors(dge)
# convert counts to log2CPM
cpm_counts <- cpm(dge, log = TRUE, prior.count = 3)
# fit model with limma eBayes trend
fit <- lmFit(cpm_counts, design)
fit <- eBayes(fit, trend=TRUE)
# pulling specific contrasts
test_age <- topTable(fit, number = Inf, coef = "meta$dev_stageP0")
test_tissue <- topTable(fit, number = Inf, coef = "meta$tissue_typethymus")
#ggplot(data=test_age, aes(x = -log2(adj.P.Val))) + geom_histogram(binwidth = 0.1) + geom_vline(xintercept = -log(0.05), linetype="dotted", colour="red")
ggplot(data=test_age, aes(x = adj.P.Val)) + geom_histogram(binwidth = 0.01)
for (i in 1:5){
print(paste0("The ", i ," differentially expressed genes in dev stage 0: ", arrange(test_age, desc(logFC))[i,1], "name is: ", rownames(arrange(test_age, desc(logFC)))[i]))
}
## [1] "The 1 differentially expressed genes in dev stage 0: 13.4426598341768name is: mt-Atp6"
## [1] "The 2 differentially expressed genes in dev stage 0: 13.1775034528567name is: mt-Co1"
## [1] "The 3 differentially expressed genes in dev stage 0: 12.5060148981273name is: Hsd3b1"
## [1] "The 4 differentially expressed genes in dev stage 0: 12.3340004747955name is: mt-Co3"
## [1] "The 5 differentially expressed genes in dev stage 0: 12.2950949067205name is: Cyp21a1"
volcanoplot(fit,coef=5)